/*-------------------------------------------------------------------------
 *
 * encnames.c
 *	  Encoding names and routines for working with them.
 *
 * Portions Copyright (c) 2001-2020, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/encnames.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"

#include <ctype.h>
#include <unistd.h>

#include "mb/pg_wchar.h"


/* ----------
 * All encoding names, sorted:		 *** A L P H A B E T I C ***
 *
 * All names must be without irrelevant chars, search routines use
 * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
 * are always converted to 'iso88591'. All must be lower case.
 *
 * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
 *
 * Karel Zak, Aug 2001
 * ----------
 */
/*
 * One entry of the name lookup table: a normalized encoding name (or alias)
 * together with the encoding ID it resolves to.
 */
struct pg_encname
{
    const char *name;			/* lower-case, alphanumeric-only name */
    pg_enc		encoding;		/* encoding ID the name maps to */
};

typedef struct pg_encname pg_encname;

static const pg_encname pg_encname_tbl[] =
        {
                {
                        "abc", PG_WIN1258
                },							/* alias for WIN1258 */
                {
                        "alt", PG_WIN866
                },							/* IBM866 */
                {
                        "big5", PG_BIG5
                },							/* Big5; Chinese for Taiwan multibyte set */
                {
                        "euccn", PG_EUC_CN
                },							/* EUC-CN; Extended Unix Code for simplified
								 * Chinese */
                {
                        "eucjis2004", PG_EUC_JIS_2004
                },							/* EUC-JIS-2004; Extended UNIX Code fixed
								 * Width for Japanese, standard JIS X 0213 */
                {
                        "eucjp", PG_EUC_JP
                },							/* EUC-JP; Extended UNIX Code fixed Width for
								 * Japanese, standard OSF */
                {
                        "euckr", PG_EUC_KR
                },							/* EUC-KR; Extended Unix Code for Korean , KS
								 * X 1001 standard */
                {
                        "euctw", PG_EUC_TW
                },							/* EUC-TW; Extended Unix Code for
								 *
								 * traditional Chinese */
                {
                        "gb18030", PG_GB18030
                },							/* GB18030;GB18030 */
                {
                        "gbk", PG_GBK
                },							/* GBK; Chinese Windows CodePage 936
								 * simplified Chinese */
                {
                        "iso88591", PG_LATIN1
                },							/* ISO-8859-1; RFC1345,KXS2 */
                {
                        "iso885910", PG_LATIN6
                },							/* ISO-8859-10; RFC1345,KXS2 */
                {
                        "iso885913", PG_LATIN7
                },							/* ISO-8859-13; RFC1345,KXS2 */
                {
                        "iso885914", PG_LATIN8
                },							/* ISO-8859-14; RFC1345,KXS2 */
                {
                        "iso885915", PG_LATIN9
                },							/* ISO-8859-15; RFC1345,KXS2 */
                {
                        "iso885916", PG_LATIN10
                },							/* ISO-8859-16; RFC1345,KXS2 */
                {
                        "iso88592", PG_LATIN2
                },							/* ISO-8859-2; RFC1345,KXS2 */
                {
                        "iso88593", PG_LATIN3
                },							/* ISO-8859-3; RFC1345,KXS2 */
                {
                        "iso88594", PG_LATIN4
                },							/* ISO-8859-4; RFC1345,KXS2 */
                {
                        "iso88595", PG_ISO_8859_5
                },							/* ISO-8859-5; RFC1345,KXS2 */
                {
                        "iso88596", PG_ISO_8859_6
                },							/* ISO-8859-6; RFC1345,KXS2 */
                {
                        "iso88597", PG_ISO_8859_7
                },							/* ISO-8859-7; RFC1345,KXS2 */
                {
                        "iso88598", PG_ISO_8859_8
                },							/* ISO-8859-8; RFC1345,KXS2 */
                {
                        "iso88599", PG_LATIN5
                },							/* ISO-8859-9; RFC1345,KXS2 */
                {
                        "johab", PG_JOHAB
                },							/* JOHAB; Extended Unix Code for simplified
								 * Chinese */
                {
                        "koi8", PG_KOI8R
                },							/* _dirty_ alias for KOI8-R (backward
								 * compatibility) */
                {
                        "koi8r", PG_KOI8R
                },							/* KOI8-R; RFC1489 */
                {
                        "koi8u", PG_KOI8U
                },							/* KOI8-U; RFC2319 */
                {
                        "latin1", PG_LATIN1
                },							/* alias for ISO-8859-1 */
                {
                        "latin10", PG_LATIN10
                },							/* alias for ISO-8859-16 */
                {
                        "latin2", PG_LATIN2
                },							/* alias for ISO-8859-2 */
                {
                        "latin3", PG_LATIN3
                },							/* alias for ISO-8859-3 */
                {
                        "latin4", PG_LATIN4
                },							/* alias for ISO-8859-4 */
                {
                        "latin5", PG_LATIN5
                },							/* alias for ISO-8859-9 */
                {
                        "latin6", PG_LATIN6
                },							/* alias for ISO-8859-10 */
                {
                        "latin7", PG_LATIN7
                },							/* alias for ISO-8859-13 */
                {
                        "latin8", PG_LATIN8
                },							/* alias for ISO-8859-14 */
                {
                        "latin9", PG_LATIN9
                },							/* alias for ISO-8859-15 */
                {
                        "mskanji", PG_SJIS
                },							/* alias for Shift_JIS */
                {
                        "muleinternal", PG_MULE_INTERNAL
                },
                {
                        "shiftjis", PG_SJIS
                },							/* Shift_JIS; JIS X 0202-1991 */

                {
                        "shiftjis2004", PG_SHIFT_JIS_2004
                },							/* SHIFT-JIS-2004; Shift JIS for Japanese,
								 * standard JIS X 0213 */
                {
                        "sjis", PG_SJIS
                },							/* alias for Shift_JIS */
                {
                        "sqlascii", PG_SQL_ASCII
                },
                {
                        "tcvn", PG_WIN1258
                },							/* alias for WIN1258 */
                {
                        "tcvn5712", PG_WIN1258
                },							/* alias for WIN1258 */
                {
                        "uhc", PG_UHC
                },							/* UHC; Korean Windows CodePage 949 */
                {
                        "unicode", PG_UTF8
                },							/* alias for UTF8 */
                {
                        "utf8", PG_UTF8
                },							/* alias for UTF8 */
                {
                        "vscii", PG_WIN1258
                },							/* alias for WIN1258 */
                {
                        "win", PG_WIN1251
                },							/* _dirty_ alias for windows-1251 (backward
								 * compatibility) */
                {
                        "win1250", PG_WIN1250
                },							/* alias for Windows-1250 */
                {
                        "win1251", PG_WIN1251
                },							/* alias for Windows-1251 */
                {
                        "win1252", PG_WIN1252
                },							/* alias for Windows-1252 */
                {
                        "win1253", PG_WIN1253
                },							/* alias for Windows-1253 */
                {
                        "win1254", PG_WIN1254
                },							/* alias for Windows-1254 */
                {
                        "win1255", PG_WIN1255
                },							/* alias for Windows-1255 */
                {
                        "win1256", PG_WIN1256
                },							/* alias for Windows-1256 */
                {
                        "win1257", PG_WIN1257
                },							/* alias for Windows-1257 */
                {
                        "win1258", PG_WIN1258
                },							/* alias for Windows-1258 */
                {
                        "win866", PG_WIN866
                },							/* IBM866 */
                {
                        "win874", PG_WIN874
                },							/* alias for Windows-874 */
                {
                        "win932", PG_SJIS
                },							/* alias for Shift_JIS */
                {
                        "win936", PG_GBK
                },							/* alias for GBK */
                {
                        "win949", PG_UHC
                },							/* alias for UHC */
                {
                        "win950", PG_BIG5
                },							/* alias for BIG5 */
                {
                        "windows1250", PG_WIN1250
                },							/* Windows-1251; Microsoft */
                {
                        "windows1251", PG_WIN1251
                },							/* Windows-1251; Microsoft */
                {
                        "windows1252", PG_WIN1252
                },							/* Windows-1252; Microsoft */
                {
                        "windows1253", PG_WIN1253
                },							/* Windows-1253; Microsoft */
                {
                        "windows1254", PG_WIN1254
                },							/* Windows-1254; Microsoft */
                {
                        "windows1255", PG_WIN1255
                },							/* Windows-1255; Microsoft */
                {
                        "windows1256", PG_WIN1256
                },							/* Windows-1256; Microsoft */
                {
                        "windows1257", PG_WIN1257
                },							/* Windows-1257; Microsoft */
                {
                        "windows1258", PG_WIN1258
                },							/* Windows-1258; Microsoft */
                {
                        "windows866", PG_WIN866
                },							/* IBM866 */
                {
                        "windows874", PG_WIN874
                },							/* Windows-874; Microsoft */
                {
                        "windows932", PG_SJIS
                },							/* alias for Shift_JIS */
                {
                        "windows936", PG_GBK
                },							/* alias for GBK */
                {
                        "windows949", PG_UHC
                },							/* alias for UHC */
                {
                        "windows950", PG_BIG5
                }							/* alias for BIG5 */
        };

/* ----------
 * These are "official" encoding names.
 * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
 * ----------
 */
/*
 * Build one table initializer from a bare encoding name: the stringized
 * name and the matching PG_<name> constant.  The codepage argument is only
 * emitted as a third initializer when WIN32 is defined; otherwise it is
 * dropped.
 */
#ifdef WIN32
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
#else
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
#endif

/*
 * Table of official encoding names, one DEF_ENC2NAME() entry per encoding.
 *
 * Each entry expands to the stringized name and the PG_<name> encoding ID
 * (plus the second macro argument, the codepage, when WIN32 is defined).
 * pg_encoding_to_char() indexes this array directly with the encoding ID
 * and asserts that the stored ID matches, so the entry order here is
 * significant and must not be rearranged.
 *
 * NOTE(review): a codepage of 0 presumably means "no matching Windows
 * codepage" -- confirm against the pg_enc2name declaration.
 */
const pg_enc2name pg_enc2name_tbl[] =
        {
                DEF_ENC2NAME(SQL_ASCII, 0),
                DEF_ENC2NAME(EUC_JP, 20932),
                DEF_ENC2NAME(EUC_CN, 20936),
                DEF_ENC2NAME(EUC_KR, 51949),
                DEF_ENC2NAME(EUC_TW, 0),
                DEF_ENC2NAME(EUC_JIS_2004, 20932),
                DEF_ENC2NAME(UTF8, 65001),
                DEF_ENC2NAME(MULE_INTERNAL, 0),
                DEF_ENC2NAME(LATIN1, 28591),
                DEF_ENC2NAME(LATIN2, 28592),
                DEF_ENC2NAME(LATIN3, 28593),
                DEF_ENC2NAME(LATIN4, 28594),
                DEF_ENC2NAME(LATIN5, 28599),
                DEF_ENC2NAME(LATIN6, 0),
                DEF_ENC2NAME(LATIN7, 0),
                DEF_ENC2NAME(LATIN8, 0),
                DEF_ENC2NAME(LATIN9, 28605),
                DEF_ENC2NAME(LATIN10, 0),
                DEF_ENC2NAME(WIN1256, 1256),
                DEF_ENC2NAME(WIN1258, 1258),
                DEF_ENC2NAME(WIN866, 866),
                DEF_ENC2NAME(WIN874, 874),
                DEF_ENC2NAME(KOI8R, 20866),
                DEF_ENC2NAME(WIN1251, 1251),
                DEF_ENC2NAME(WIN1252, 1252),
                DEF_ENC2NAME(ISO_8859_5, 28595),
                DEF_ENC2NAME(ISO_8859_6, 28596),
                DEF_ENC2NAME(ISO_8859_7, 28597),
                DEF_ENC2NAME(ISO_8859_8, 28598),
                DEF_ENC2NAME(WIN1250, 1250),
                DEF_ENC2NAME(WIN1253, 1253),
                DEF_ENC2NAME(WIN1254, 1254),
                DEF_ENC2NAME(WIN1255, 1255),
                DEF_ENC2NAME(WIN1257, 1257),
                DEF_ENC2NAME(KOI8U, 21866),
                DEF_ENC2NAME(SJIS, 932),
                DEF_ENC2NAME(BIG5, 950),
                DEF_ENC2NAME(GBK, 936),
                DEF_ENC2NAME(UHC, 949),
                DEF_ENC2NAME(GB18030, 54936),
                DEF_ENC2NAME(JOHAB, 0),
                DEF_ENC2NAME(SHIFT_JIS_2004, 932)
        };

/* ----------
 * These are encoding names for gettext.
 *
 * This covers all encodings except MULE_INTERNAL, which is alien to gettext.
 *
 * Each entry pairs an encoding ID with the name string to use for it.  The
 * list is terminated by an entry whose name is NULL.  Note that PG_EUC_JP
 * and PG_EUC_JIS_2004 both map to "EUC-JP".
 * ----------
 */
const pg_enc2gettext pg_enc2gettext_tbl[] =
        {
                {PG_SQL_ASCII, "US-ASCII"},
                {PG_UTF8, "UTF-8"},
                {PG_LATIN1, "LATIN1"},
                {PG_LATIN2, "LATIN2"},
                {PG_LATIN3, "LATIN3"},
                {PG_LATIN4, "LATIN4"},
                {PG_ISO_8859_5, "ISO-8859-5"},
                /* NOTE(review): underscore spelling differs from sibling ISO-8859-N names -- confirm intentional */
                {PG_ISO_8859_6, "ISO_8859-6"},
                {PG_ISO_8859_7, "ISO-8859-7"},
                {PG_ISO_8859_8, "ISO-8859-8"},
                {PG_LATIN5, "LATIN5"},
                {PG_LATIN6, "LATIN6"},
                {PG_LATIN7, "LATIN7"},
                {PG_LATIN8, "LATIN8"},
                /* NOTE(review): hyphenated unlike the other LATINn names -- confirm intentional */
                {PG_LATIN9, "LATIN-9"},
                {PG_LATIN10, "LATIN10"},
                {PG_KOI8R, "KOI8-R"},
                {PG_KOI8U, "KOI8-U"},
                {PG_WIN1250, "CP1250"},
                {PG_WIN1251, "CP1251"},
                {PG_WIN1252, "CP1252"},
                {PG_WIN1253, "CP1253"},
                {PG_WIN1254, "CP1254"},
                {PG_WIN1255, "CP1255"},
                {PG_WIN1256, "CP1256"},
                {PG_WIN1257, "CP1257"},
                {PG_WIN1258, "CP1258"},
                {PG_WIN866, "CP866"},
                {PG_WIN874, "CP874"},
                {PG_EUC_CN, "EUC-CN"},
                {PG_EUC_JP, "EUC-JP"},
                {PG_EUC_KR, "EUC-KR"},
                {PG_EUC_TW, "EUC-TW"},
                {PG_EUC_JIS_2004, "EUC-JP"},
                {PG_SJIS, "SHIFT-JIS"},
                {PG_BIG5, "BIG5"},
                {PG_GBK, "GBK"},
                {PG_UHC, "UHC"},
                {PG_GB18030, "GB18030"},
                {PG_JOHAB, "JOHAB"},
                {PG_SHIFT_JIS_2004, "SHIFT_JISX0213"},
                /* end-of-list marker: NULL name */
                {0, NULL}
        };


/*
 * Table of encoding names for ICU (currently covers backend encodings only)
 *
 * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
 *
 * NULL entries are not supported by ICU, or their mapping is unclear.
 *
 * The array is indexed directly by encoding ID; the trailing comment on each
 * entry names the ID that slot is meant for, so entries must not be
 * reordered.  get_encoding_name_for_icu() statically asserts that the table
 * has exactly PG_ENCODING_BE_LAST + 1 entries.
 */
static const char *const pg_enc2icu_tbl[] =
        {
                NULL,						/* PG_SQL_ASCII */
                "EUC-JP",					/* PG_EUC_JP */
                "EUC-CN",					/* PG_EUC_CN */
                "EUC-KR",					/* PG_EUC_KR */
                "EUC-TW",					/* PG_EUC_TW */
                NULL,						/* PG_EUC_JIS_2004 */
                "UTF-8",					/* PG_UTF8 */
                NULL,						/* PG_MULE_INTERNAL */
                "ISO-8859-1",				/* PG_LATIN1 */
                "ISO-8859-2",				/* PG_LATIN2 */
                "ISO-8859-3",				/* PG_LATIN3 */
                "ISO-8859-4",				/* PG_LATIN4 */
                "ISO-8859-9",				/* PG_LATIN5 */
                "ISO-8859-10",				/* PG_LATIN6 */
                "ISO-8859-13",				/* PG_LATIN7 */
                "ISO-8859-14",				/* PG_LATIN8 */
                "ISO-8859-15",				/* PG_LATIN9 */
                NULL,						/* PG_LATIN10 */
                "CP1256",					/* PG_WIN1256 */
                "CP1258",					/* PG_WIN1258 */
                "CP866",					/* PG_WIN866 */
                NULL,						/* PG_WIN874 */
                "KOI8-R",					/* PG_KOI8R */
                "CP1251",					/* PG_WIN1251 */
                "CP1252",					/* PG_WIN1252 */
                "ISO-8859-5",				/* PG_ISO_8859_5 */
                "ISO-8859-6",				/* PG_ISO_8859_6 */
                "ISO-8859-7",				/* PG_ISO_8859_7 */
                "ISO-8859-8",				/* PG_ISO_8859_8 */
                "CP1250",					/* PG_WIN1250 */
                "CP1253",					/* PG_WIN1253 */
                "CP1254",					/* PG_WIN1254 */
                "CP1255",					/* PG_WIN1255 */
                "CP1257",					/* PG_WIN1257 */
                "KOI8-U",					/* PG_KOI8U */
        };


/*
 * Is this encoding supported by ICU?
 */
bool
is_encoding_supported_by_icu(int encoding)
{
    /*
     * The range check must come first: pg_enc2icu_tbl is only indexed once
     * the ID is known to be a valid backend encoding.  A NULL slot means
     * "not supported".
     */
    return PG_VALID_BE_ENCODING(encoding) &&
        pg_enc2icu_tbl[encoding] != NULL;
}

/*
 * Returns ICU's name for encoding, or NULL if not supported
 */
const char *
get_encoding_name_for_icu(int encoding)
{
    /* Compile-time check that the table has one slot per backend encoding. */
    StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
                     "pg_enc2icu_tbl incomplete");

    /* IDs outside the backend range have no slot; report them as NULL. */
    return PG_VALID_BE_ENCODING(encoding) ? pg_enc2icu_tbl[encoding] : NULL;
}


/* ----------
 * Encoding checks: return the encoding ID on success, or -1 on error
 * ----------
 */
/*
 * Resolve an encoding name and check it with PG_VALID_FE_ENCODING.
 *
 * Returns the encoding ID, or -1 if the name is not recognized by
 * pg_char_to_encoding() or fails the check.
 */
int
pg_valid_client_encoding(const char *name)
{
    int			id = pg_char_to_encoding(name);

    if (id < 0 || !PG_VALID_FE_ENCODING(id))
        return -1;

    return id;
}

/*
 * Resolve an encoding name and check it with PG_VALID_BE_ENCODING.
 *
 * Returns the encoding ID, or -1 if the name is not recognized by
 * pg_char_to_encoding() or fails the check.
 */
int
pg_valid_server_encoding(const char *name)
{
    int			id = pg_char_to_encoding(name);

    if (id < 0 || !PG_VALID_BE_ENCODING(id))
        return -1;

    return id;
}

/*
 * Function wrapper around the PG_VALID_BE_ENCODING() macro: returns the
 * macro's result for the given encoding ID.
 */
int
pg_valid_server_encoding_id(int encoding)
{
    int			valid = PG_VALID_BE_ENCODING(encoding);

    return valid;
}

/*
 * Remove irrelevant chars from encoding name, store at *newkey
 *
 * (Caller's responsibility to provide a large enough buffer)
 */
/*
 * Normalize an encoding name into newkey and return newkey.
 *
 * Only characters accepted by isalnum() are copied; 'A'..'Z' are folded to
 * 'a'..'z' with plain arithmetic (tolower() is not used).  The output is
 * always NUL-terminated and never longer than the input, so newkey must
 * have room for strlen(key) + 1 bytes.
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
    char	   *out = newkey;

    while (*key != '\0')
    {
        char		ch = *key++;

        /* skip anything isalnum() rejects */
        if (!isalnum((unsigned char) ch))
            continue;

        if (ch >= 'A' && ch <= 'Z')
            ch += 'a' - 'A';

        *out++ = ch;
    }

    *out = '\0';
    return newkey;
}

/*
 * Search encoding by encoding name
 *
 * Returns encoding ID, or -1 if not recognized
 */
/*
 * Look up an encoding by name.
 *
 * name: encoding name or alias as written by the user; it is normalized
 * with clean_encoding_name() before the lookup, so case and
 * non-alphanumeric characters are ignored.
 *
 * Returns the encoding ID, or -1 if name is NULL, empty, NAMEDATALEN or
 * more bytes long, or not present in pg_encname_tbl.
 */
int
pg_char_to_encoding(const char *name)
{
    unsigned int lo = 0;
    unsigned int hi = lengthof(pg_encname_tbl); /* one past last candidate */
    char		buff[NAMEDATALEN];
    const char *key;

    if (name == NULL || *name == '\0')
        return -1;

    if (strlen(name) >= NAMEDATALEN)
        return -1;				/* it's certainly not in the table */

    /* cleaned name is never longer than the input, so it fits in buff */
    key = clean_encoding_name(name, buff);

    /*
     * Binary search over the half-open index range [lo, hi).  Working with
     * indexes rather than pointers means we never form a pointer before the
     * start of the table, even when the key sorts below the first entry.
     * Ordering is decided by strcmp() alone so that every comparison uses
     * the same collation as the table's sort order.
     */
    while (lo < hi)
    {
        unsigned int mid = lo + (hi - lo) / 2;
        const pg_encname *entry = &pg_encname_tbl[mid];
        int			cmp = strcmp(key, entry->name);

        if (cmp == 0)
            return entry->encoding;

        if (cmp < 0)
            hi = mid;
        else
            lo = mid + 1;
    }

    return -1;
}

/*
 * Return the official name of an encoding, or an empty string if the ID
 * does not pass PG_VALID_ENCODING.
 */
const char *
pg_encoding_to_char(int encoding)
{
    const pg_enc2name *entry;

    if (!PG_VALID_ENCODING(encoding))
        return "";

    /* the table is indexed by encoding ID; cross-check the stored ID */
    entry = &pg_enc2name_tbl[encoding];
    Assert(encoding == entry->encoding);

    return entry->name;
}